Raw data

Loading packages

library(data.table)
library(rmarkdown)
library(AnnotationHub)
library(tidyverse)
library(tximport)
library(ggplot2)
library(DESeq2)
library(pheatmap)
library(gridExtra)
library(ggplotify)
library(ggrepel)
library(UpSetR)

Setting AnnotationHub

Assign your species of interest

AnnotationSpecies <- "Homo sapiens"  # Assign your species 
ah <- AnnotationHub(hub=getAnnotationHubOption("URL"))   # Bring annotation DB

Running AnnotationHub

ahQuery <- query(ah, c("OrgDb", AnnotationSpecies))      # Filter annotation of interest
if (length(ahQuery) == 1) {
    DBName <- names(ahQuery)
} else if (length(ahQuery) > 1) {
               DBName <- names(ahQuery)[1]
} else {
    print("You don't have a valid DB")
    rmarkdown::render() 
} 
AnnoDb <- ah[[DBName]] # Store into an OrgDb object  
# Explore your OrgDb object with following accessors:
# columns(AnnpDb)
# keytypes(AnnoDb)
# keys(AnnoDb, keytype=..)
# select(AnnoDb, keys=.., columns=.., keytype=...)
AnnoKey <- keys(AnnoDb, keytype="ENSEMBLTRANS")
# Note: Annotation has to be done with not genome but transcripts 
AnnoDb <- select(AnnoDb, 
                 AnnoKey,
                 keytype="ENSEMBLTRANS",
                 columns="SYMBOL")

Checking out the AnnotationHub output

# Check if your AnnoDb has been extracted and saved correctely
class(AnnoDb)
## [1] "data.frame"
head(AnnoDb)
##      ENSEMBLTRANS SYMBOL
## 1 ENST00000543404  A2MP1
## 2 ENST00000566278  A2MP1
## 3 ENST00000545343  A2MP1
## 4 ENST00000544183  A2MP1
## 5 ENST00000286479   NAT2
## 6 ENST00000520116   NAT2

Defining file name and path for .sf files

.sf files have been created from fastq data by salmon

# This code chunk needs to be written by yourself 
# Define sample names 
SampleNames <-  c("Mock_72hpi_S1",
                 "Mock_72hpi_S2",
                 "Mock_72hpi_S3",
                 "SARS-CoV-2_72hpi_S7",
                 "SARS-CoV-2_72hpi_S8",
                 "SARS-CoV-2_72hpi_S9") 
# Define group level
GroupLevel <- c("Mock", "COVID")
# Define contrast for DE analysis
Contrast <- c("Group", "COVID", "Mock")
# Define a vector for comparing TPM vs Counts effect 
TvC <- c("TPM", "Counts")
# Define .sf file path
sf <- c(paste0(SampleNames,
               ".fastq.gz.salmon_quant/quant.sf"))
# Define sample groups
group <- c(rep("Mock", 3), rep("COVID", 3))
# Create metadata
metadata <- data.frame(Sample=factor(SampleNames, levels=SampleNames),
                       Group=factor(group, levels=GroupLevel),
                       Path=sf)
rownames(metadata) <- SampleNames
# Explore the metadata
print(metadata)
##                                  Sample Group
## Mock_72hpi_S1             Mock_72hpi_S1  Mock
## Mock_72hpi_S2             Mock_72hpi_S2  Mock
## Mock_72hpi_S3             Mock_72hpi_S3  Mock
## SARS-CoV-2_72hpi_S7 SARS-CoV-2_72hpi_S7 COVID
## SARS-CoV-2_72hpi_S8 SARS-CoV-2_72hpi_S8 COVID
## SARS-CoV-2_72hpi_S9 SARS-CoV-2_72hpi_S9 COVID
##                                                                   Path
## Mock_72hpi_S1             Mock_72hpi_S1.fastq.gz.salmon_quant/quant.sf
## Mock_72hpi_S2             Mock_72hpi_S2.fastq.gz.salmon_quant/quant.sf
## Mock_72hpi_S3             Mock_72hpi_S3.fastq.gz.salmon_quant/quant.sf
## SARS-CoV-2_72hpi_S7 SARS-CoV-2_72hpi_S7.fastq.gz.salmon_quant/quant.sf
## SARS-CoV-2_72hpi_S8 SARS-CoV-2_72hpi_S8.fastq.gz.salmon_quant/quant.sf
## SARS-CoV-2_72hpi_S9 SARS-CoV-2_72hpi_S9.fastq.gz.salmon_quant/quant.sf

Converting .sf files to txi list

# Assign sample names to the input (.sf) file path
names(sf) <- SampleNames
# Run tximport
# tpm vs original counts
# input sf: a factor of all .sf files' path
txi_tpm <- tximport(sf, 
                    type="salmon",
                    tx2gene=AnnoDb,
                    countsFromAbundance="lengthScaledTPM", # Extracts TPM 
                    ignoreTxVersion=T) 
txi_counts <- tximport(sf, 
                    type="salmon",
                    tx2gene=AnnoDb,
                    ignoreTxVersion=T) 

Exploring the txi outputs

# tpm 
head(txi_tpm$counts)
##         Mock_72hpi_S1 Mock_72hpi_S2 Mock_72hpi_S3 SARS-CoV-2_72hpi_S7
## A2MP1        5.705442      5.571614     0.8060672             0.00000
## A3GALT2      0.000000      0.000000     0.0000000             0.00000
## A4GNT        0.000000      2.953081     0.9964319             0.00000
## AACSP1      14.635652     19.646913     9.0911703            19.54913
## AADACL2      0.000000      0.000000     0.0000000             0.00000
## AADACL4      1.001855      0.000000     1.9900495             0.00000
##         SARS-CoV-2_72hpi_S8 SARS-CoV-2_72hpi_S9
## A2MP1              0.000000            7.265574
## A3GALT2            0.000000            0.000000
## A4GNT              0.000000            1.009116
## AACSP1            18.772579            4.560492
## AADACL2            0.000000            0.000000
## AADACL4            2.009001            0.000000
dim(txi_tpm$counts)
## [1] 11013     6
# counts
head(txi_counts$counts)
##         Mock_72hpi_S1 Mock_72hpi_S2 Mock_72hpi_S3 SARS-CoV-2_72hpi_S7
## A2MP1               7             7             1                   0
## A3GALT2             0             0             0                   0
## A4GNT               0             3             1                   0
## AACSP1             16            22            10                  21
## AADACL2             0             0             0                   0
## AADACL4             1             0             2                   0
##         SARS-CoV-2_72hpi_S8 SARS-CoV-2_72hpi_S9
## A2MP1                     0                   3
## A3GALT2                   0                   0
## A4GNT                     0                   1
## AACSP1                   10                   5
## AADACL2                   0                   0
## AADACL4                   2                   0
dim(txi_counts$counts)
## [1] 11013     6

Creating DESeq objects from txi and VST

-References: DESeq2 doc “Transcript abundance files”, DESeq2 doc “Variance stabilizing transformation”

# Set a function creating dds and vsd
dds_vsd_fn <- function(txi) { 
    # Create a DESeq object (so-calledd dds) 
    des <- DESeqDataSetFromTximport(txi, 
                                    colData=metadata,
                                    design=~Group)
    # Create a vsd object (so-called vsd) 
    ves <- vst(des, blind=T)
    # Output them as a list 
    return(list(dds=des, vsd=ves))
}
TPM <- dds_vsd_fn(txi_tpm)
Counts <- dds_vsd_fn(txi_counts)
# Outputs
# dds: TPM/Counts[[1]] or TPM/Counts[['dds']] 
# vsd: TPM/Counts[[2]] or TPM/Counts[['vsd']]

Exploring created dds

# TPM 
TPM[[1]]
## class: DESeqDataSet 
## dim: 11013 6 
## metadata(1): version
## assays(1): counts
## rownames(11013): A2MP1 A3GALT2 ... ZXDA ZXDB
## rowData names(0):
## colnames(6): Mock_72hpi_S1 Mock_72hpi_S2 ... SARS-CoV-2_72hpi_S8
##   SARS-CoV-2_72hpi_S9
## colData names(3): Sample Group Path
head(counts(TPM[[1]]))
##         Mock_72hpi_S1 Mock_72hpi_S2 Mock_72hpi_S3 SARS-CoV-2_72hpi_S7
## A2MP1               6             6             1                   0
## A3GALT2             0             0             0                   0
## A4GNT               0             3             1                   0
## AACSP1             15            20             9                  20
## AADACL2             0             0             0                   0
## AADACL4             1             0             2                   0
##         SARS-CoV-2_72hpi_S8 SARS-CoV-2_72hpi_S9
## A2MP1                     0                   7
## A3GALT2                   0                   0
## A4GNT                     0                   1
## AACSP1                   19                   5
## AADACL2                   0                   0
## AADACL4                   2                   0
# Counts
Counts[[1]]
## class: DESeqDataSet 
## dim: 11013 6 
## metadata(1): version
## assays(2): counts avgTxLength
## rownames(11013): A2MP1 A3GALT2 ... ZXDA ZXDB
## rowData names(0):
## colnames(6): Mock_72hpi_S1 Mock_72hpi_S2 ... SARS-CoV-2_72hpi_S8
##   SARS-CoV-2_72hpi_S9
## colData names(3): Sample Group Path
head(counts(Counts[[1]]))
##         Mock_72hpi_S1 Mock_72hpi_S2 Mock_72hpi_S3 SARS-CoV-2_72hpi_S7
## A2MP1               7             7             1                   0
## A3GALT2             0             0             0                   0
## A4GNT               0             3             1                   0
## AACSP1             16            22            10                  21
## AADACL2             0             0             0                   0
## AADACL4             1             0             2                   0
##         SARS-CoV-2_72hpi_S8 SARS-CoV-2_72hpi_S9
## A2MP1                     0                   3
## A3GALT2                   0                   0
## A4GNT                     0                   1
## AACSP1                   10                   5
## AADACL2                   0                   0
## AADACL4                   2                   0

Exploring created vsd

# TPM
TPM[[2]]
## class: DESeqTransform 
## dim: 11013 6 
## metadata(1): version
## assays(1): ''
## rownames(11013): A2MP1 A3GALT2 ... ZXDA ZXDB
## rowData names(4): baseMean baseVar allZero dispFit
## colnames(6): Mock_72hpi_S1 Mock_72hpi_S2 ... SARS-CoV-2_72hpi_S8
##   SARS-CoV-2_72hpi_S9
## colData names(4): Sample Group Path sizeFactor
# Counts
Counts[[2]]
## class: DESeqTransform 
## dim: 11013 6 
## metadata(1): version
## assays(1): ''
## rownames(11013): A2MP1 A3GALT2 ... ZXDA ZXDB
## rowData names(4): baseMean baseVar allZero dispFit
## colnames(6): Mock_72hpi_S1 Mock_72hpi_S2 ... SARS-CoV-2_72hpi_S8
##   SARS-CoV-2_72hpi_S9
## colData names(3): Sample Group Path

Estimating size factors, dispersions, and conducting Wald Test

# Set a function estimating size factors, dispersions, and perform wald test
DESeqPrep_fn <- function(List) {
    
    List[[1]] <- estimateSizeFactors(List[[1]])
    List[[1]] <- estimateDispersions(List[[1]])
    List[[1]] <- nbinomWaldTest(List[[1]])
   
    return(List)
}
# Update dds with the function
Counts <- DESeqPrep_fn(Counts) 
TPM <- DESeqPrep_fn(TPM)

Exploring size factors

sizeFactors(Counts[[1]])
## NULL
sizeFactors(TPM[[1]])
##       Mock_72hpi_S1       Mock_72hpi_S2       Mock_72hpi_S3 SARS-CoV-2_72hpi_S7 
##           1.0991742           1.0489678           0.8749073           1.3751270 
## SARS-CoV-2_72hpi_S8 SARS-CoV-2_72hpi_S9 
##           0.7673324           1.0049385
# Size factors don't exist in the Counts dds!
# Normalization factors are calculated in the Counts dds instead! 
assays(Counts[[1]])
## List of length 6
## names(6): counts avgTxLength normalizationFactors mu H cooks
assays(TPM[[1]])
## List of length 4
## names(4): counts mu H cooks
colData(Counts[[1]])
## DataFrame with 6 rows and 3 columns
##                                  Sample    Group                   Path
##                                <factor> <factor>            <character>
## Mock_72hpi_S1       Mock_72hpi_S1          Mock  Mock_72hpi_S1.fastq...
## Mock_72hpi_S2       Mock_72hpi_S2          Mock  Mock_72hpi_S2.fastq...
## Mock_72hpi_S3       Mock_72hpi_S3          Mock  Mock_72hpi_S3.fastq...
## SARS-CoV-2_72hpi_S7 SARS-CoV-2_72hpi_S7    COVID SARS-CoV-2_72hpi_S7...
## SARS-CoV-2_72hpi_S8 SARS-CoV-2_72hpi_S8    COVID SARS-CoV-2_72hpi_S8...
## SARS-CoV-2_72hpi_S9 SARS-CoV-2_72hpi_S9    COVID SARS-CoV-2_72hpi_S9...
colData(TPM[[1]])
## DataFrame with 6 rows and 4 columns
##                                  Sample    Group                   Path
##                                <factor> <factor>            <character>
## Mock_72hpi_S1       Mock_72hpi_S1          Mock  Mock_72hpi_S1.fastq...
## Mock_72hpi_S2       Mock_72hpi_S2          Mock  Mock_72hpi_S2.fastq...
## Mock_72hpi_S3       Mock_72hpi_S3          Mock  Mock_72hpi_S3.fastq...
## SARS-CoV-2_72hpi_S7 SARS-CoV-2_72hpi_S7    COVID SARS-CoV-2_72hpi_S7...
## SARS-CoV-2_72hpi_S8 SARS-CoV-2_72hpi_S8    COVID SARS-CoV-2_72hpi_S8...
## SARS-CoV-2_72hpi_S9 SARS-CoV-2_72hpi_S9    COVID SARS-CoV-2_72hpi_S9...
##                     sizeFactor
##                      <numeric>
## Mock_72hpi_S1         1.099174
## Mock_72hpi_S2         1.048968
## Mock_72hpi_S3         0.874907
## SARS-CoV-2_72hpi_S7   1.375127
## SARS-CoV-2_72hpi_S8   0.767332
## SARS-CoV-2_72hpi_S9   1.004939

Plotting the size factors in TPM

# Extract and save the size factors in a data frame
sizeFactor <- as.data.frame(round(sizeFactors(TPM[[1]]), 3))
colnames(sizeFactor) <- 'Size_Factor'
sizeFactor <- sizeFactor %>%
    rownames_to_column(var="Sample") %>%
    inner_join(metadata[, 1:ncol(metadata)-1], by="Sample") 
# Create a plot comparing the size factors by sample
ggplot(sizeFactor, aes(x=Sample, 
                       y=Size_Factor, 
                       fill=Group,
                       label=Size_Factor)) +
    geom_bar(stat="identity", width=0.8) +
    theme_bw() + 
    ggtitle("Size Factors in TPM-DESeq") +
    geom_text(vjust=1.5) +
    theme(axis.text.x=element_text(angle=45, 
                                   vjust=0.5)) + ylab("Size Factor")

Plotting nornalization factors in the Counts

# Extract and normalization factors in a data frame
normf <- as.data.frame(normalizationFactors(Counts[[1]])) %>%
    gather(Sample, Normalization_Factor) %>%
    inner_join(metadata[, 1:2], by="Sample") 
normf$Sample <- factor(normf$Sample, levels=SampleNames)
normf$Group <- factor(normf$Group, levels=GroupLevel)
# Create a scatter plot showing distribution of normalization factors 
normFactors_plot <- ggplot(normf, 
       aes(x=Sample, y=Normalization_Factor)) + 
geom_jitter(alpha=0.5, aes(color=Group)) + 
# Add a boxplot to provide statistics in each sample
geom_boxplot(aes(x=Sample, y=Normalization_Factor), 
             outlier.shape=NA) + 
theme_bw() +
ggtitle("Normalization Factors in Counts-DESeq") +
theme(axis.text.x=element_text(angle=45, 
                               vjust=0.5)) + 
ylab("Normalization Factor / Gene") +
# Add a dashed horizontal line to indicate where normalization factor=1
geom_hline(yintercept=1, 
           color="blue", 
           linetype="dashed")
# Print the normalization factor scatter plot 
print(normFactors_plot)

# Print the same plot with larger y-magnification in order to observe the box plot 
normFactors_plot + 
    ylim(0.5, 1.5)

Results & Discussion 1:

- Patterns of the size factors (from TPM) and the normalization factors per gene (from Counts) by sample were similar in terms of magnitude

Setting functions for QC plots

# Assigne what to compare
GroupOfInterest <- Contrast[1] 
# Set a function for a PCA plot
QCPCA_fn <- function(inputList, Title) {
    plotPCA(inputList[[2]],    # takes vsd
            intgroup=GroupOfInterest) + theme_bw() + ggtitle(Title)
}
# Set heatmap annotation 
ColOfInterest <- !colnames(metadata) %in% c("Sample", "Path")
HeatmapAnno <- as.data.frame(metadata[, ColOfInterest])
rownames(HeatmapAnno) <- SampleNames
colnames(HeatmapAnno) <- colnames(metadata)[ColOfInterest]
# Set a function for a correlation heatmap 
QCcorrHeatmap_fn <- function(inputList, Title) {
    # Extract transformed count matrix
    mtx <- assay(inputList[[2]])      # takes vsd
    # Calculate correlation and store in the matrix
    mtx <- cor(mtx)
    
    
    # Create a correlation heatmap
    return(pheatmap(mtx, 
             annotation=HeatmapAnno,
             main=paste("Sample Correlation Heatmap:",
                        Title)))
}

Sample QC: Principal Component Analysis

grid.arrange(QCPCA_fn(TPM, "QC PCA: TPM"), 
             QCPCA_fn(Counts, "QC PCA: Counts"), 
             nrow=2)

Sample QC: Sample Correlation Heatmap

# TPM
QCcorrHeatmap_fn(TPM, "TPM") 

# Counts
QCcorrHeatmap_fn(Counts, "Counts") 

Results & Discussion 2:

- Unsupervised learning-mediated sample QC resulted in highly similar patterns between TPM and Counts

Running DE analysis

# Create a list for TPM and Counts dds 
ddsList <- list(TPM=TPM[[1]], Counts=Counts[[1]]) 
for (x in TvC) {
    # Run DESeq() 
    ddsList[[x]] <- DESeq(ddsList[[x]])
    print(resultsNames(ddsList[[x]]))
}
## [1] "Intercept"           "Group_COVID_vs_Mock"
## [1] "Intercept"           "Group_COVID_vs_Mock"

Creating dispersion plots

# Plot dispersion  
for (x in TvC) {
    plotDispEsts(ddsList[[x]], 
                 ylab="Dispersion", 
                 xlab="Mean of Normalized Counts", 
                 main=paste("Dispersion of", x, "Input"))
}

Results & Discussion 3:

- Both inputs gave a goot fit

- No major difference was seen in the dispersion plots between two conditions

Performing shrinkage

# Create an empty list for shrunken dds
shr_ddsList <- list(TPM=c(), Counts=c()) 
for (x in TvC) {
    # shrink
    shr_ddsList[[x]] <- lfcShrink(ddsList[[x]], 
                                  contrast=Contrast, # contrast  
                                  type="normal")     # is paired with "normal" type
}

Extracting log2FoldChange and p-values with or without shrinkage

# Set FDR threshold 
alpha=0.1 
# FDR threshold vector
FDRv=c("< 0.1", "> 0.1") 
# Set a function cleaning table
Sig_fn <- function(df, Input) {
    df <- df %>% 
        rownames_to_column(var="Gene") %>%
        mutate(FDR=ifelse(padj < 0.1 & !is.na(padj), 
                                   FDRv[1], 
                                   FDRv[2]), 
               Input=Input) 
    return(df)
}
# Initialize lists of result tables with (resList) or without (shr_resList) shrinkage
resList <- ddsList 
shr_resList <- ddsList  
for (x in TvC) {
    # Extract results
    resList[[x]] <- as.data.frame(results(ddsList[[x]], 
                                          contrast=Contrast, 
                                          alpha=alpha))
    shr_resList[[x]] <- as.data.frame(shr_ddsList[[x]])
    # clean the data frame
    resList[[x]] <- Sig_fn(resList[[x]], x)
    shr_resList[[x]] <- Sig_fn(shr_resList[[x]], x)
    
}

Exploratory data analysis of the extracted log2FoldChange tables

# No shrinkage summary
summary(resList)
##        Length Class      Mode
## TPM    9      data.frame list
## Counts 9      data.frame list
head(resList[['TPM']])
##      Gene   baseMean log2FoldChange     lfcSE        stat    pvalue      padj
## 1   A2MP1  3.2145217    -0.83018949 1.7557571 -0.47283846 0.6363284        NA
## 2 A3GALT2  0.0000000             NA        NA          NA        NA        NA
## 3   A4GNT  0.8330031    -1.92416012 3.0773846 -0.62525826 0.5318016        NA
## 4  AACSP1 14.5467370     0.02184858 0.7549595  0.02894006 0.9769124 0.9923554
## 5 AADACL2  0.0000000             NA        NA          NA        NA        NA
## 6 AADACL4  0.9670271    -0.35981582 2.9306060 -0.12277864 0.9022824        NA
##     FDR Input
## 1 > 0.1   TPM
## 2 > 0.1   TPM
## 3 > 0.1   TPM
## 4 > 0.1   TPM
## 5 > 0.1   TPM
## 6 > 0.1   TPM
head(resList[['Counts']])
##      Gene   baseMean log2FoldChange    lfcSE        stat    pvalue      padj
## 1   A2MP1  2.9604793    -0.90913229 1.813261 -0.50137976 0.6161039        NA
## 2 A3GALT2  0.0000000             NA       NA          NA        NA        NA
## 3   A4GNT  0.8386688    -1.90512920 3.007858 -0.63338401 0.5264829        NA
## 4  AACSP1 14.0360149    -0.06225852 0.760773 -0.08183586 0.9347772 0.9796733
## 5 AADACL2  0.0000000             NA       NA          NA        NA        NA
## 6 AADACL4  0.9793046    -0.35701152 2.864781 -0.12462089 0.9008237        NA
##     FDR  Input
## 1 > 0.1 Counts
## 2 > 0.1 Counts
## 3 > 0.1 Counts
## 4 > 0.1 Counts
## 5 > 0.1 Counts
## 6 > 0.1 Counts
# Shrinkage summary
summary(shr_resList)
##        Length Class      Mode
## TPM    9      data.frame list
## Counts 9      data.frame list
head(shr_resList[['TPM']])
##      Gene   baseMean log2FoldChange     lfcSE        stat    pvalue      padj
## 1   A2MP1  3.2145217     -0.1588907 0.3391943 -0.47283846 0.6363284        NA
## 2 A3GALT2  0.0000000             NA        NA          NA        NA        NA
## 3   A4GNT  0.8330031     -0.1303038 0.2238159 -0.62525826 0.5318016        NA
## 4  AACSP1 14.5467370      0.0122880 0.4253130  0.02894006 0.9769124 0.9923554
## 5 AADACL2  0.0000000             NA        NA          NA        NA        NA
## 6 AADACL4  0.9670271     -0.0275875 0.2314025 -0.12277864 0.9022824        NA
##     FDR Input
## 1 > 0.1   TPM
## 2 > 0.1   TPM
## 3 > 0.1   TPM
## 4 > 0.1   TPM
## 5 > 0.1   TPM
## 6 > 0.1   TPM
head(shr_resList[['Counts']])
##      Gene   baseMean log2FoldChange     lfcSE        stat    pvalue      padj
## 1   A2MP1  2.9604793    -0.15661253 0.3347863 -0.50137976 0.6161039        NA
## 2 A3GALT2  0.0000000             NA        NA          NA        NA        NA
## 3   A4GNT  0.8386688    -0.13471856 0.2283621 -0.63338401 0.5264829        NA
## 4  AACSP1 14.0360149    -0.03454845 0.4258173 -0.08183586 0.9347772 0.9796733
## 5 AADACL2  0.0000000             NA        NA          NA        NA        NA
## 6 AADACL4  0.9793046    -0.02852632 0.2359889 -0.12462089 0.9008237        NA
##     FDR  Input
## 1 > 0.1 Counts
## 2 > 0.1 Counts
## 3 > 0.1 Counts
## 4 > 0.1 Counts
## 5 > 0.1 Counts
## 6 > 0.1 Counts

Exploring mean-difference relationship with MA plots

# Set ylim: has to adjusted by users depending on data 
yl <- c(-20, 20)
# Set min log2 fold change of interest 
mLog <- c(-1, 1)
# Define a function creating an MA plot
MA_fn <- function(List, Shr) {
    MAList <- ddsList 
    for (i in 1:2) {
        MAplot <- ggplot(List[[i]], 
                         aes(x=baseMean,
                             y=log2FoldChange,
                             color=FDR)) + geom_point() + scale_x_log10() + theme_bw() + scale_color_manual(values=c("blue", "grey")) + ggtitle(paste("MA plot:", names(List)[i], "Input with", Shr)) + ylim(yl[1], yl[2])+ geom_hline(yintercept=c(mLog[1], mLog[2]), linetype="dashed", color="red")
        MAList[[i]] <- MAplot
    }
    return(MAList)
}
# Create MA plots with or without shrinkage and store in a list
MA <- MA_fn(resList, "No Shrinkage")
shr_MA <- MA_fn(shr_resList, "Shrinkage")

Displaying MA plots

  • x-axis: expression level (baseMean))

  • y-axis: fold change (log2FoldChange)

  • Red dashed lines: log2FoldChange = -1 and 1

  • Upper: TPM with (right) or without (left) shrinkage

  • Lower: Counts with (right) or without (left) shrinkage

# TPM with or without shrinkage
grid.arrange(MA[[1]], shr_MA[[1]], nrow=1)

# TPM with or without shrinkage
grid.arrange(MA[[2]], shr_MA[[2]], nrow=1)

Results & Discussion 4:

- Both inputs gave fairly identical MA plots

Exploring distribution of false discovery rate (FDR)

  • Only the shrunken results are taken for further analysis

  • Distribution of adjusted p-val (FDR) was presented

  • x-axis: FDR

  • y-axis: Number of genes

  • Black dashed line: FDR = 0.1

# Combining total data table 
res <- rbind(shr_resList[['TPM']], shr_resList[['Counts']])
res$Input <- factor(res$Input, levels=TvC)  # TvC=c("TPM", "Counts")
# Create a plot presenting distribution of FDR
ggplot(res,
       aes(x=padj, color=Input)) + 
geom_density(size=1, aes(y=..count..)) + 
theme_bw() +
ggtitle("Distribution of False Discovery Rate (FDR)") + 
xlab("Adjusted P-Value (FDR)") + 
ylab("Count") + 
geom_vline(xintercept=alpha, 
           color="black", 
           linetype="dashed",
           size=1) + 
scale_x_continuous(breaks=seq(0, 1, by=0.1)) 

Results & Discussion 5:

- Both inputs showed highly similar FDR distribution patterns across the FDR range.

- When the FDR > 0.7, more genes were counted in the TPM input than in the Counts.

Volcano plots

  • x-axis: expression level (log2FoldChange)

  • y-axis: log odds (larger log odds = more promising genes)

  • Red dashed lines: log2FoldChange = -1 and 1

# Set xlim for volcano plots
xlim=c(-6, 6)    # has to be assined by users
# Set a basic volcano plot function 
Volcano_fn <- function(df, Label=NULL) {
ggplot(res, 
       aes(x=log2FoldChange,
           y= -log10(padj),
           color=FDR,
           label=Label)) + 
geom_point() +
facet_grid(.~Input) + 
theme_bw() +
scale_color_manual(values=c("blue", "grey")) + 
ggtitle("Volcano Plot") + 
ylab("-log10(padj)") + 
theme(strip.text.x=element_text(size=12)) + 
geom_vline(xintercept=c(mLog[1], mLog[2]), 
           color="red", 
           linetype="dashed", 
           size=1) + 
xlim(xlim[1], xlim[2])
}
# Display the volcano plots by input
Volcano_fn(res)

Volcano plots with promising genes

  • Log odds threshold (y-axis): > 20
# Assign log odds threshold 
LogOddsCut=20 
# Add a column indicating high log odds genes 
res <- res %>% 
    mutate(Label=ifelse(-log10(padj) > LogOddsCut, 
                                   Gene, 
                                   "")) 
# Display the genes with volcano plots
Volcano_fn(res, Label=res$Label) + geom_text_repel(color="black")

Results & Discussion 6:

- Both inputs gave nearly identical volcano plots

Exploring distribution of log2FoldChange by input type

  • Black dashed lines: log2FoldChange = -1 and 1

  • x-axis: gene expression level (log2FoldChange)

  • y-axis: number of genes

ggplot(res[res$FDR == "< 0.1", ],  # Subset rows with FDR < alpha 
       aes(x=log2FoldChange,
           color=Input)) + 
geom_density(size=1, aes(y=..count..)) +
theme_bw() + 
ylab("Count") + 
geom_vline(xintercept=c(mLog[1], mLog[2]), 
           color="black",
           linetype="dashed", size=1) +
ggtitle("Distribution of Log2 Folds by Input Type") + 
xlim(xlim[1], xlim[2])

Results & Discussion 7:

- Both inputs showed nearly identical fold change distribution

Exploring expression profiling with normalized count data

  • Normalized count matrices are extracted from dds objects and filtered with thresholds set at FDR and log2FoldChange

  • The heatmaps display z-scores of the normalized counts

  • lowfdrList: a list of matrices filtered by FDR < alpha

  • highfoldList: a list of matrices filtered by FDR < alpha AND absolute log2FoldChange > user’s minimum threshold (mLog)

  • In this analysis, mLog = 1

  • References: Harvard Chan Bioinformatics Core workshop, DESeq2 doc “Heatmap of the count matrix”

# Initialize a list 
lowfdrList <- ddsList   # A list for normalized counts matrix with FDR below alpha
highfoldList <- ddsList  # A list for normalized counts with log2foldchange over minLog
for (x in TvC) {
    # Create filtering vectors with alpha and log2foldchange
    BelowAlpha <- shr_resList[[x]]$FDR == FDRv[1]
    overmLog <- abs(shr_resList[[x]]$log2FoldChange) > mLog[2]  # mLog has been set to c(-1, 1) previously
    # Extract transformed counts from vsd objects (TPM[['vsd']] and Counts[['vsd']]) 
    if (x == "TPM") {
        normCounts <- counts(TPM[['dds']], normalized=T)
    
    } else {
        normCounts <- counts(Counts[['dds']], normalized=T)
    }
    
    # Update the normalized count matrix with FDR below alpha
    lowfdrList[[x]] <- normCounts[BelowAlpha, ]
    highfoldList[[x]] <- normCounts[BelowAlpha & overmLog, ]
    summary(lowfdrList[[x]])
    summary(highfoldList[[x]])
}
# Initialize map lists 
lowfdrMap <- ddsList
highfoldMap <- ddsList 
# Set a function creating a heatmap
ProfileHeatmap_fn <- function(inputmatrix, Title1, Title2, Title3=NULL) {
    
    as.ggplot(pheatmap(inputmatrix, 
             annotation=HeatmapAnno,
             scale="row",         # presents z-score instead of counts
             show_rownames=F,
             main=paste("Transcription Profiles with", 
                        Title1, 
                        "input and", 
                        Title2, 
                        alpha, 
                        Title3)))
}
# Create and save heatmaps
for (x in TvC) {
    lowfdrMap[[x]] <- ProfileHeatmap_fn(lowfdrList[[x]],
                                        Title1=x, 
                                        Title2="FDR <")
    highfoldMap[[x]] <- ProfileHeatmap_fn(highfoldList[[x]], 
                                          Title1=x, 
                                          Title2="FDR <",
                                          Title3=paste("+ Absolte Log2 Fold Change >", mLog[2])) 
}

Results & Discussion 8:

- No major difference is seen bewteen the TPM and Counts

NA statistics: zero count genes & outlier genes

When NAs appear in

  • log2FoldChange: zero counts in all samples

  • padj: too little information

  • pval & padj: at least one replicate was an outlier

# Count number of NA genes  
type=c("Zero Counts", "Outliers", "Total NA Genes") 
NAstat <- res %>%
    group_by(Input) %>%
    summarize(zero=sum(is.na(log2FoldChange)), 
              outlier=sum(is.na(pvalue) & is.na(padj))) %>%
    mutate(total=sum(zero, outlier)) %>%
    gather(Type, Number, -Input) %>%
    mutate(Type=factor(case_when(Type == "zero" ~ type[1], 
                                 Type == "outlier" ~ type[2], 
                                 Type == "total" ~ type[3]), 
                       levels=type))
# Plot number of NA genes 
ggplot(NAstat, aes(x=Type, y=Number, group=Input, fill=Input, label=Number)) + 
    geom_bar(stat="identity", position="dodge") + 
    theme_bw() +
    geom_text(position=position_dodge(width=1), vjust=1.5) + 
    ggtitle("Number of NA Genes") + 
    ylab("Number of Genes")

Results & Discussion 9:

- Both inputs produced no major difference in the number of NA genes

Ranking DEGs with the TPM and original count inputs

  • FDRrankList: ranked by FDR

  • lfcList: ranked by absolute fold change

  • UPlfcList: ranked by magnitude of fold change increase

  • DOWNlfcList: ranked by manitude of fold change decrease

# Create a new list having DE table with FDR below alpha
lowfdr_resList <- shr_resList 
for (x in TvC) { 
    lowfdr_resList[[x]] <- filter(shr_resList[[x]], 
                                  FDR == FDRv[1]) %>% 
    as.data.table()
}
# Initialize new lists in order to store rank-updated result DE tables 
FDRrankList <- lowfdr_resList
lfcList <- lowfdr_resList
UPlfcList <- lowfdr_resList
DOWNlfcList <- lowfdr_resList
# Set a function creating a column for the rank
Ranking_fn <- function(x) {mutate(x, Rank=1:nrow(x))}
for (x in TvC) { 
    # Rearrange genes with FDR  
    FDRrankList[[x]] <- lowfdr_resList[[x]][order(padj),] %>%
        Ranking_fn()
    # Rearrange genew with absolute log2FoldChange 
    lfcList[[x]] <- lowfdr_resList[[x]][order(-abs(log2FoldChange)),] %>%
        Ranking_fn()
    # Rearrange genes with log2FoldChange (decreasing order)
    UPlfcList[[x]] <- lowfdr_resList[[x]][order(-log2FoldChange),] %>%
        Ranking_fn()
    # Rearrange genes with log2FoldChange (increasing order)
    DOWNlfcList[[x]] <- lowfdr_resList[[x]][order(log2FoldChange),] %>%
        Ranking_fn()
}
# Explore the ranks
print(c(FDRrankList, lfcList, UPlfcList, DOWNlfcList))
## $TPM
##         Gene   baseMean log2FoldChange     lfcSE      stat       pvalue
##   1:    CCN2 3774.26448      3.2928140 0.1897118 17.345356 2.138482e-67
##   2:     F2R 2466.06108      1.9013819 0.1194905 15.908801 5.505788e-57
##   3: TM4SF18 1929.69892      2.0213739 0.1403122 14.401312 5.077035e-47
##   4:   CDCP1 1520.13638      2.6475329 0.1880305 14.067990 5.974381e-45
##   5:   GPR87  559.19270      3.3085620 0.2418261 13.611243 3.433519e-42
##  ---                                                                   
## 482:  P2RY13   73.90787     -0.6994790 0.2832717 -2.467164 1.361881e-02
## 483:   RBM43   32.83603     -0.9008497 0.3646408 -2.465313 1.368935e-02
## 484:   VEGFD   13.48414     -1.0483426 0.4232315 -2.466442 1.364630e-02
## 485:   ZFP36   61.12927      0.7549845 0.3054432  2.468805 1.355651e-02
## 486:  TSPAN8   78.68030     -0.7002037 0.2843908 -2.461475 1.383672e-02
##              padj   FDR Input Rank
##   1: 7.420531e-64 < 0.1   TPM    1
##   2: 9.552543e-54 < 0.1   TPM    2
##   3: 5.872437e-44 < 0.1   TPM    3
##   4: 5.182775e-42 < 0.1   TPM    4
##   5: 2.382862e-39 < 0.1   TPM    5
##  ---                              
## 482: 9.794233e-02 < 0.1   TPM  482
## 483: 9.794233e-02 < 0.1   TPM  483
## 484: 9.794233e-02 < 0.1   TPM  484
## 485: 9.794233e-02 < 0.1   TPM  485
## 486: 9.879304e-02 < 0.1   TPM  486
## 
## $Counts
##          Gene  baseMean log2FoldChange     lfcSE      stat       pvalue
##   1:     CCN2 3817.8820      3.2844202 0.1915393 17.136222 7.966139e-66
##   2:      F2R 2495.7817      1.8984783 0.1186228 16.000704 1.263392e-57
##   3:  TM4SF18 1948.8878      2.0182562 0.1399437 14.417068 4.041495e-47
##   4:    CDCP1 1537.7433      2.6409502 0.1893300 13.936786 3.786111e-44
##   5:    GPR87  565.6044      3.3012898 0.2433261 13.497800 1.611156e-41
##  ---                                                                   
## 494:  PCDHA11  137.2504      0.7695862 0.3152421  2.441691 1.461864e-02
## 495: MTND1P23  796.7407     -0.6066005 0.2485948 -2.439717 1.469877e-02
## 496:    CHAC2  161.8005      0.6276699 0.2574698  2.436761 1.481945e-02
## 497:   MRPL13 3338.7125      0.2996762 0.1230187  2.436026 1.484962e-02
## 498:   TIMM8A  370.7887      0.4324878 0.1775077  2.436345 1.483652e-02
##              padj   FDR  Input Rank
##   1: 2.652724e-62 < 0.1 Counts    1
##   2: 2.103547e-54 < 0.1 Counts    2
##   3: 4.486059e-44 < 0.1 Counts    3
##   4: 3.151937e-41 < 0.1 Counts    4
##   5: 1.073030e-38 < 0.1 Counts    5
##  ---                               
## 494: 9.865956e-02 < 0.1 Counts  494
## 495: 9.888266e-02 < 0.1 Counts  495
## 496: 9.929565e-02 < 0.1 Counts  496
## 497: 9.929565e-02 < 0.1 Counts  497
## 498: 9.929565e-02 < 0.1 Counts  498
## 
## $TPM
##          Gene    baseMean log2FoldChange     lfcSE      stat       pvalue
##   1:   SLC5A3   175.78052     -5.6388461 0.3735942 -8.890554 6.080480e-19
##   2:     EDN1   141.85334      3.4475869 0.3323096 10.072153 7.335473e-24
##   3:    MUC12    36.09282      3.3271176 0.4077041  6.359295 2.026818e-10
##   4:    GPR87   559.19270      3.3085620 0.2418261 13.611243 3.433519e-42
##   5:     CCN2  3774.26448      3.2928140 0.1897118 17.345356 2.138482e-67
##  ---                                                                     
## 482:    KPNA4  3914.88812      0.2943474 0.1086781  2.708425 6.760349e-03
## 483:    CD2AP  3258.51969     -0.2941260 0.1160457 -2.534561 1.125884e-02
## 484: EIF4EBP2  4482.98608      0.2913268 0.1178709  2.471578 1.345182e-02
## 485:      ND1 35661.75864     -0.2884041 0.1035473 -2.785240 5.348811e-03
## 486:     TPMT  2885.58887      0.2844551 0.1143035  2.488592 1.282500e-02
##              padj   FDR Input Rank
##   1: 1.172181e-16 < 0.1   TPM    1
##   2: 2.828232e-21 < 0.1   TPM    2
##   3: 1.562902e-08 < 0.1   TPM    3
##   4: 2.382862e-39 < 0.1   TPM    4
##   5: 7.420531e-64 < 0.1   TPM    5
##  ---                              
## 482: 5.938838e-02 < 0.1   TPM  482
## 483: 8.643403e-02 < 0.1   TPM  483
## 484: 9.765234e-02 < 0.1   TPM  484
## 485: 5.016318e-02 < 0.1   TPM  485
## 486: 9.509136e-02 < 0.1   TPM  486
## 
## $Counts
##          Gene    baseMean log2FoldChange     lfcSE      stat       pvalue
##   1:   SLC5A3   178.16943     -5.6507587 0.3735892 -8.902361 5.467110e-19
##   2:     EDN1   143.44679      3.4520859 0.3321884 10.088629 6.202983e-24
##   3:    GPR87   565.60436      3.3012898 0.2433261 13.497800 1.611156e-41
##   4:     CCN2  3817.88197      3.2844202 0.1915393 17.136222 7.966139e-66
##   5:    MUC12    21.13784      3.2420717 0.4215420  7.009996 2.383243e-12
##  ---                                                                     
## 494:      ND1 36116.54615     -0.2919487 0.1030042 -2.834337 4.592082e-03
## 495:    KPNA4  3961.88892      0.2907356 0.1076690  2.700261 6.928505e-03
## 496: EIF4EBP2  4539.93490      0.2878481 0.1178920  2.441629 1.462115e-02
## 497:     TPMT  2922.37985      0.2810499 0.1138352  2.468918 1.355223e-02
## 498:  IER3IP1  6644.69103     -0.2681817 0.1092941 -2.453759 1.413716e-02
##              padj   FDR  Input Rank
##   1: 1.011415e-16 < 0.1 Counts    1
##   2: 2.295104e-21 < 0.1 Counts    2
##   3: 1.073030e-38 < 0.1 Counts    3
##   4: 2.652724e-62 < 0.1 Counts    4
##   5: 2.088473e-10 < 0.1 Counts    5
##  ---                               
## 494: 4.224208e-02 < 0.1 Counts  494
## 495: 5.840993e-02 < 0.1 Counts  495
## 496: 9.865956e-02 < 0.1 Counts  496
## 497: 9.382315e-02 < 0.1 Counts  497
## 498: 9.666685e-02 < 0.1 Counts  498
## 
## $TPM
##          Gene   baseMean log2FoldChange     lfcSE      stat       pvalue
##   1:     EDN1  141.85334       3.447587 0.3323096 10.072153 7.335473e-24
##   2:    MUC12   36.09282       3.327118 0.4077041  6.359295 2.026818e-10
##   3:    GPR87  559.19270       3.308562 0.2418261 13.611243 3.433519e-42
##   4:     CCN2 3774.26448       3.292814 0.1897118 17.345356 2.138482e-67
##   5: SERPINE1 8982.01227       2.857580 0.3273250  8.752297 2.090532e-18
##  ---                                                                    
## 482:   TRMT9B  145.09703      -1.725347 0.3138013 -5.481822 4.209682e-08
## 483:  WFIKKN2  368.31877      -1.904874 0.2350517 -8.090961 5.919605e-16
## 484:   GPR171  242.54169      -2.103719 0.2436843 -8.604146 7.688684e-18
## 485:  HLA-DRA  273.70898      -2.141204 0.2510380 -8.510174 1.736718e-17
## 486:   SLC5A3  175.78052      -5.638846 0.3735942 -8.890554 6.080480e-19
##              padj   FDR Input Rank
##   1: 2.828232e-21 < 0.1   TPM    1
##   2: 1.562902e-08 < 0.1   TPM    2
##   3: 2.382862e-39 < 0.1   TPM    3
##   4: 7.420531e-64 < 0.1   TPM    4
##   5: 3.817971e-16 < 0.1   TPM    5
##  ---                              
## 482: 2.001040e-06 < 0.1   TPM  482
## 483: 8.216411e-14 < 0.1   TPM  483
## 484: 1.333987e-15 < 0.1   TPM  484
## 485: 2.739278e-15 < 0.1   TPM  485
## 486: 1.172181e-16 < 0.1   TPM  486
## 
## $Counts
##          Gene   baseMean log2FoldChange     lfcSE      stat       pvalue
##   1:     EDN1  143.44679       3.452086 0.3321884 10.088629 6.202983e-24
##   2:    GPR87  565.60436       3.301290 0.2433261 13.497800 1.611156e-41
##   3:     CCN2 3817.88197       3.284420 0.1915393 17.136222 7.966139e-66
##   4:    MUC12   21.13784       3.242072 0.4215420  7.009996 2.383243e-12
##   5: SERPINE1 9084.96822       2.853314 0.3275487  8.733282 2.473876e-18
##  ---                                                                    
## 494:   TRMT9B  145.58638      -1.733195 0.3137029 -5.514008 3.507534e-08
## 495:  WFIKKN2  373.25309      -1.906352 0.2355697 -8.079008 6.529557e-16
## 496:   GPR171  210.73283      -2.093395 0.2444487 -8.557681 1.151604e-17
## 497:  HLA-DRA  276.91753      -2.141394 0.2514838 -8.494088 1.994923e-17
## 498:   SLC5A3  178.16943      -5.650759 0.3735892 -8.902361 5.467110e-19
##              padj   FDR  Input Rank
##   1: 2.295104e-21 < 0.1 Counts    1
##   2: 1.073030e-38 < 0.1 Counts    2
##   3: 2.652724e-62 < 0.1 Counts    3
##   4: 2.088473e-10 < 0.1 Counts    4
##   5: 4.335793e-16 < 0.1 Counts    5
##  ---                               
## 494: 1.622235e-06 < 0.1 Counts  494
## 495: 8.697370e-14 < 0.1 Counts  495
## 496: 1.917421e-15 < 0.1 Counts  496
## 497: 3.019589e-15 < 0.1 Counts  497
## 498: 1.011415e-16 < 0.1 Counts  498
## 
## $TPM
##          Gene   baseMean log2FoldChange     lfcSE      stat       pvalue
##   1:   SLC5A3  175.78052      -5.638846 0.3735942 -8.890554 6.080480e-19
##   2:  HLA-DRA  273.70898      -2.141204 0.2510380 -8.510174 1.736718e-17
##   3:   GPR171  242.54169      -2.103719 0.2436843 -8.604146 7.688684e-18
##   4:  WFIKKN2  368.31877      -1.904874 0.2350517 -8.090961 5.919605e-16
##   5:   TRMT9B  145.09703      -1.725347 0.3138013 -5.481822 4.209682e-08
##  ---                                                                    
## 482: SERPINE1 8982.01227       2.857580 0.3273250  8.752297 2.090532e-18
## 483:     CCN2 3774.26448       3.292814 0.1897118 17.345356 2.138482e-67
## 484:    GPR87  559.19270       3.308562 0.2418261 13.611243 3.433519e-42
## 485:    MUC12   36.09282       3.327118 0.4077041  6.359295 2.026818e-10
## 486:     EDN1  141.85334       3.447587 0.3323096 10.072153 7.335473e-24
##              padj   FDR Input Rank
##   1: 1.172181e-16 < 0.1   TPM    1
##   2: 2.739278e-15 < 0.1   TPM    2
##   3: 1.333987e-15 < 0.1   TPM    3
##   4: 8.216411e-14 < 0.1   TPM    4
##   5: 2.001040e-06 < 0.1   TPM    5
##  ---                              
## 482: 3.817971e-16 < 0.1   TPM  482
## 483: 7.420531e-64 < 0.1   TPM  483
## 484: 2.382862e-39 < 0.1   TPM  484
## 485: 1.562902e-08 < 0.1   TPM  485
## 486: 2.828232e-21 < 0.1   TPM  486
## 
## $Counts
##          Gene   baseMean log2FoldChange     lfcSE      stat       pvalue
##   1:   SLC5A3  178.16943      -5.650759 0.3735892 -8.902361 5.467110e-19
##   2:  HLA-DRA  276.91753      -2.141394 0.2514838 -8.494088 1.994923e-17
##   3:   GPR171  210.73283      -2.093395 0.2444487 -8.557681 1.151604e-17
##   4:  WFIKKN2  373.25309      -1.906352 0.2355697 -8.079008 6.529557e-16
##   5:   TRMT9B  145.58638      -1.733195 0.3137029 -5.514008 3.507534e-08
##  ---                                                                    
## 494: SERPINE1 9084.96822       2.853314 0.3275487  8.733282 2.473876e-18
## 495:    MUC12   21.13784       3.242072 0.4215420  7.009996 2.383243e-12
## 496:     CCN2 3817.88197       3.284420 0.1915393 17.136222 7.966139e-66
## 497:    GPR87  565.60436       3.301290 0.2433261 13.497800 1.611156e-41
## 498:     EDN1  143.44679       3.452086 0.3321884 10.088629 6.202983e-24
##              padj   FDR  Input Rank
##   1: 1.011415e-16 < 0.1 Counts    1
##   2: 3.019589e-15 < 0.1 Counts    2
##   3: 1.917421e-15 < 0.1 Counts    3
##   4: 8.697370e-14 < 0.1 Counts    4
##   5: 1.622235e-06 < 0.1 Counts    5
##  ---                               
## 494: 4.335793e-16 < 0.1 Counts  494
## 495: 2.088473e-10 < 0.1 Counts  495
## 496: 2.652724e-62 < 0.1 Counts  496
## 497: 1.073030e-38 < 0.1 Counts  497
## 498: 2.295104e-21 < 0.1 Counts  498

Comparing DEG ranks between TPM- and Counts-inputted DE analysis

# Set a function rebuilding DE tables with gene ranks 
combineTable_fn <- function(List){
    # Select columns and join the data frames by gene
    full_join(List[[TvC[1]]][,.(Gene, Input, Rank, baseMean)], 
              List[[TvC[2]]][,.(Gene, Input, Rank, baseMean)], by="Gene") %>%
    
    # Add columns assining gene expression levels, rank differences, and mean ranks
    mutate(logMeanExpression=log(baseMean.x+baseMean.y/2),
           RankDiff=Rank.x-Rank.y, 
           MeanRank=(Rank.x+Rank.y)/2)
} 
# Explore outputs of the function
head(combineTable_fn(FDRrankList))
##       Gene Input.x Rank.x baseMean.x Input.y Rank.y baseMean.y
## 1:    CCN2     TPM      1  3774.2645  Counts      1  3817.8820
## 2:     F2R     TPM      2  2466.0611  Counts      2  2495.7817
## 3: TM4SF18     TPM      3  1929.6989  Counts      3  1948.8878
## 4:   CDCP1     TPM      4  1520.1364  Counts      4  1537.7433
## 5:   GPR87     TPM      5   559.1927  Counts      5   565.6044
## 6:   F2RL2     TPM      6  3010.5527  Counts      6  3047.3796
##    logMeanExpression RankDiff MeanRank
## 1:          8.645271        0        1
## 2:          8.219852        0        2
## 3:          7.973894        0        3
## 4:          7.735874        0        4
## 5:          6.735774        0        5
## 6:          8.419413        0        6
dim(combineTable_fn(FDRrankList))
## [1] 503  10
tail(combineTable_fn(FDRrankList))
##        Gene Input.x Rank.x baseMean.x Input.y Rank.y baseMean.y
## 1:    KCTD7    <NA>     NA         NA  Counts    492   248.5733
## 2:  PCDHA11    <NA>     NA         NA  Counts    494   137.2504
## 3: MTND1P23    <NA>     NA         NA  Counts    495   796.7407
## 4:    CHAC2    <NA>     NA         NA  Counts    496   161.8005
## 5:   MRPL13    <NA>     NA         NA  Counts    497  3338.7125
## 6:   TIMM8A    <NA>     NA         NA  Counts    498   370.7887
##    logMeanExpression RankDiff MeanRank
## 1:                NA       NA       NA
## 2:                NA       NA       NA
## 3:                NA       NA       NA
## 4:                NA       NA       NA
## 5:                NA       NA       NA
## 6:                NA       NA       NA
# Set a function plotting gene ranks between TPM- (x-axis) and Counts-Inputs (y-axis)
compareRanks_fn <- function(df, rankedby) {
ggplot(df, 
       aes(x=Rank.x, 
           y=Rank.y,
           color=logMeanExpression)) +
geom_point(alpha=0.5) + 
theme_bw() + 
xlab("Rank with TPM") + 
ylab("Rank with Counts") + 
geom_abline(slope=1, color="black", size=0.5) + 
ggtitle(paste(rankedby, "Ranking witn TPM vs Counts Inputs")) + 
scale_color_gradient(low="blue", high="red")
}
   
# Set a function plotting the rank difference over the gene expression level
RankdiffOverMean_fn <- function(df, rankedby) {
ggplot(df, 
       aes(x=logMeanExpression, 
           y=RankDiff,
           color=MeanRank)) +
geom_point(alpha=0.5) + 
theme_bw() + 
ylab("Rank Difference (TPM - Counts)") +
ggtitle(paste("Rank Difference Inputs (TPM - Counts) in", rankedby)) + 
geom_hline(yintercept=0, color="black", size=0.5) + 
scale_color_gradient(low="blue", high="red")
}

Visualizing DEG ranks I: TPM- vs Counts-input

  • x-axis: rank with TPM input

  • y-axis: rank with Counts input

  • Black diagonal lines: rank with TPM = rank with Counts

  • Dot color: gene expression level (log-baseMean)

# Ranked by FDR
compareRanks_fn(combineTable_fn(FDRrankList), 
                "FDR")

# Ranked by absolute fold change 
compareRanks_fn(combineTable_fn(lfcList), 
                "Absolute Log2FoldChange")

# Ranked by magnitude of positive fold change
compareRanks_fn(combineTable_fn(UPlfcList), 
                "Log2FoldChange (Increased)")

# Ranked by magnitude of negative fold change
compareRanks_fn(combineTable_fn(DOWNlfcList), 
                "Log2FoldChange (Decreased)")

Results & Discussion 10:

- FDR ranking (top): Genes with high expression levels (red dots) were ranked similarly between two input types which is close to the black diagonal line. Ranking dissimilarity increased in lower ranking ranges (250-500).

- Absolute fold ranking (2nd top): Genes were ranked higher as their expression levels were lower (blue to purple dots). Genes with high expression levels (red dots) showed less difference in the rankings which is close to the black diagonal line.

- Decrease/increase fold ranking (bottom/2nd botton): Highly expressed genes (red dots) showed middle-rankings (150-350). Low-ranking genes tended to be ranked higher with TPM than with Counts as the slope was higher than 1 which is indicated by the black diagonal line.

Visualizing DEG ranks II: Relationship between gene expression level and rank difference

  • x-axis: expression level (log-baseMean)

  • y-axis: rank difference (rank with TPM - rank with Counts)

  • Black horizontal lines: rank with TPM = rank with Counts

  • Dot color: average rank

# Ranked by FDR
RankdiffOverMean_fn(combineTable_fn(FDRrankList), 
                "FDR")

# Ranked by absolute fold change 
RankdiffOverMean_fn(combineTable_fn(lfcList), 
                "Absolute Log2FoldChange")

# Ranked by magnitude of positive fold change
RankdiffOverMean_fn(combineTable_fn(UPlfcList), 
                "Log2FoldChange (Increased)")

# Ranked by magnitude of negative fold change
RankdiffOverMean_fn(combineTable_fn(DOWNlfcList), 
                "Log2FoldChange (Decreased)")

Results & Discussion 11:

- Difference in FDR ranking (top): High-ranking genes (blue dots) showed less difference. Genes with high expression levels (higher x-values) showed less difference.

- Difference in absolute/increase fold ranking (2nd top/2nd bottom): Low mean ranks (red dots) seemed to be coupled with higher ranking with TPM (negative y-values) than that with Counts while high mean ranks (purple to blue dots) seemed vice versa or random.

- Difference in decrease fold ranking (bottom): Low-to-middle ranking genes (red-purple dots) tended to have higher ranking with TPM (negative y-values) whereas high ranking genes (blue dots) showed higher ranking with Counts (positive y-values)

Summarizing up/down DEGs with an upset plot

  • Calculate the number of genes
# Clean data to generate an upset plot
res <- res %>%
    # Filter genes with valid padj 
    filter(!is.na(padj)) %>% 
    # Detect genes which are up/down/unchanged change patterns in either TPM and Counts inputs
    mutate(Up=ifelse(FDR == FDRv[1] & log2FoldChange > 0, Gene, ""), # What are upregulated genes? 
           Down=ifelse(FDR == FDRv[1] & log2FoldChange < 0, Gene, ""),  # What are downregulated genes? 
           Unchanged=ifelse(FDR == FDRv[2], Gene, ""),   # What are unchanged genes? 
           TPM_Input=ifelse(Input == "TPM", Gene, ""),   # What are the genes from TPM input? 
           Counts_Input=ifelse(Input == "Counts", Gene, ""))   # What are the genes from Counts input?
# Create a list storing groups of interest
upsetInput <- list(Up=res$Up, 
                   Down=res$Down, 
                   Unchanged=res$Unchanged, 
                   TPM_Input=res$TPM, 
                   Counts_Input=res$Counts)
# Create the upset plot 
upset(fromList(upsetInput), 
      sets.x.label="Number of Genes per Group", 
      order.by="freq") 

Session Info

sessionInfo()
## R version 4.0.2 (2020-06-22)
## Platform: x86_64-conda_cos6-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.1 LTS
## 
## Matrix products: default
## BLAS/LAPACK: /home/mira/miniconda3/envs/salmon/lib/libopenblasp-r0.3.10.so
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats4    parallel  stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] AnnotationDbi_1.52.0        UpSetR_1.4.0               
##  [3] ggrepel_0.8.2               ggplotify_0.0.5            
##  [5] gridExtra_2.3               pheatmap_1.0.12            
##  [7] DESeq2_1.30.0               SummarizedExperiment_1.20.0
##  [9] Biobase_2.50.0              MatrixGenerics_1.2.0       
## [11] matrixStats_0.57.0          GenomicRanges_1.42.0       
## [13] GenomeInfoDb_1.26.0         IRanges_2.24.0             
## [15] S4Vectors_0.28.0            tximport_1.18.0            
## [17] forcats_0.5.0               stringr_1.4.0              
## [19] dplyr_1.0.2                 purrr_0.3.4                
## [21] readr_1.4.0                 tidyr_1.1.2                
## [23] tibble_3.0.4                ggplot2_3.3.2              
## [25] tidyverse_1.3.0             AnnotationHub_2.22.0       
## [27] BiocFileCache_1.14.0        dbplyr_2.0.0               
## [29] BiocGenerics_0.36.0         rmarkdown_2.5              
## [31] data.table_1.13.2          
## 
## loaded via a namespace (and not attached):
##  [1] colorspace_2.0-0              ellipsis_0.3.1               
##  [3] XVector_0.30.0                fs_1.5.0                     
##  [5] rstudioapi_0.13               farver_2.0.3                 
##  [7] bit64_4.0.5                   interactiveDisplayBase_1.28.0
##  [9] fansi_0.4.1                   lubridate_1.7.9              
## [11] xml2_1.3.2                    splines_4.0.2                
## [13] geneplotter_1.68.0            knitr_1.30                   
## [15] jsonlite_1.7.1                broom_0.7.2                  
## [17] annotate_1.68.0               shiny_1.5.0                  
## [19] BiocManager_1.30.10           compiler_4.0.2               
## [21] httr_1.4.2                    rvcheck_0.1.8                
## [23] backports_1.2.0               assertthat_0.2.1             
## [25] Matrix_1.2-18                 fastmap_1.0.1                
## [27] cli_2.1.0                     later_1.1.0.1                
## [29] htmltools_0.5.0               tools_4.0.2                  
## [31] gtable_0.3.0                  glue_1.4.2                   
## [33] GenomeInfoDbData_1.2.4        rappdirs_0.3.1               
## [35] Rcpp_1.0.5                    cellranger_1.1.0             
## [37] vctrs_0.3.4                   xfun_0.19                    
## [39] ps_1.4.0                      rvest_0.3.6                  
## [41] mime_0.9                      lifecycle_0.2.0              
## [43] XML_3.99-0.3                  zlibbioc_1.36.0              
## [45] scales_1.1.1                  hms_0.5.3                    
## [47] promises_1.1.1                RColorBrewer_1.1-2           
## [49] yaml_2.2.1                    curl_4.3                     
## [51] memoise_1.1.0                 stringi_1.4.6                
## [53] RSQLite_2.2.1                 BiocVersion_3.12.0           
## [55] genefilter_1.72.0             BiocParallel_1.24.0          
## [57] rlang_0.4.8                   pkgconfig_2.0.3              
## [59] bitops_1.0-6                  evaluate_0.14                
## [61] lattice_0.20-41               labeling_0.4.2               
## [63] bit_4.0.4                     tidyselect_1.1.0             
## [65] plyr_1.8.6                    magrittr_1.5                 
## [67] R6_2.5.0                      generics_0.1.0               
## [69] DelayedArray_0.16.0           DBI_1.1.0                    
## [71] pillar_1.4.6                  haven_2.3.1                  
## [73] withr_2.3.0                   survival_3.2-7               
## [75] RCurl_1.98-1.2                modelr_0.1.8                 
## [77] crayon_1.3.4                  locfit_1.5-9.4               
## [79] grid_4.0.2                    readxl_1.3.1                 
## [81] blob_1.2.1                    reprex_0.3.0                 
## [83] digest_0.6.27                 xtable_1.8-4                 
## [85] httpuv_1.5.4                  gridGraphics_0.5-0           
## [87] munsell_0.5.0